diff --git a/.gitignore b/.gitignore index ddeef37..e141398 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,8 @@ node_modules /kiwix-tools bin/zimdump + +*.tfstate +*.tfstate.* +*.terraform +*.terraform.* diff --git a/Dockerfile b/Dockerfile index a78e2aa..92472e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,28 +1,34 @@ -FROM debian:stable +# This Dockerfile creates a self-contained image in which mirrorzim.sh can be executed. +# It also runs ipfs daemon. +# +# You can build the image as follows (remember to use this repo as context for the build): +# docker build . --platform=linux/amd64 -f Dockerfile -t distributed-wikipedia-mirror +# +# You can then run the container anywhere as follows: +# docker run --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp distributed-wikipedia-mirror -ENV DEBIAN_FRONTEND=noninteractive +FROM stedolan/jq:latest AS jq +FROM openzim/zim-tools:3.1.0 AS zimdump +FROM ipfs/go-ipfs:v0.12.0 AS ipfs +FROM node:16 -RUN apt update -RUN apt -y install --no-install-recommends git ca-certificates curl wget apt-utils +RUN apt-get update && apt-get install --no-install-recommends --assume-yes rsync moreutils && rm -rf /var/lib/apt/lists/* -# install: -# - node and yarn -# - go-ipfs -RUN curl -sL https://deb.nodesource.com/setup_14.x -o nodesource_setup.sh \ - && bash nodesource_setup.sh \ - && apt -y install --no-install-recommends nodejs \ - && npm install -g yarn \ - && wget -nv https://dist.ipfs.io/go-ipfs/v0.8.0/go-ipfs_v0.8.0_linux-amd64.tar.gz \ - && tar xvfz go-ipfs_v0.8.0_linux-amd64.tar.gz \ - && mv go-ipfs/ipfs /usr/local/bin/ipfs \ - && rm -r go-ipfs && rm go-ipfs_v0.8.0_linux-amd64.tar.gz \ - && ipfs init -p server,local-discovery,flatfs,randomports --empty-repo \ - && ipfs config --json 'Experimental.ShardingEnabled' true +COPY --from=jq /usr/local/bin/jq /usr/local/bin/ +COPY --from=zimdump /usr/local/bin/zimdump /usr/local/bin/ +COPY --from=ipfs /usr/local/bin/ipfs /usr/local/bin/ -# TODO: move repo init after external volume is mounted +COPY 
assets /root/assets +COPY bin /root/bin +COPY src /root/src +COPY tools /root/tools +COPY mirrorzim.sh package.json tsconfig.json /root/ -ENV DEBIAN_FRONTEND=dialog +WORKDIR /root +RUN mkdir /root/snapshots /root/tmp +RUN yarn -RUN mkdir /root/distributed-wikipedia-mirror -VOLUME ["/root/distributed-wikipedia-mirror"] -WORKDIR /root/distributed-wikipedia-mirror +EXPOSE 4001/tcp +EXPOSE 4001/udp + +ENTRYPOINT [ "tools/entrypoint.sh" ] diff --git a/README.md b/README.md index 58fe7ac..621a2af 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ This step won't be necessary when automatic sharding lands in go-ipfs (wip). ### Step 3: Download the latest snapshot from kiwix.org -Source of ZIM files is at https://download.kiwix.org/zim/wikipedia/ +Source of ZIM files is at https://download.kiwix.org/zim/wikipedia/ Make sure you download `_all_maxi_` snapshots, as those include images. To automate this, you can also use the `getzim.sh` script: @@ -164,8 +164,8 @@ $ zimdump dump ./snapshots/wikipedia_tr_all_maxi_2021-01.zim --dir ./tmp/wikiped > ### ℹ️ ZIM's main page > -> Each ZIM file has "main page" attribute which defines the landing page set for the ZIM archive. -> It is often different than the "main page" of upstream Wikipedia. +> Each ZIM file has "main page" attribute which defines the landing page set for the ZIM archive. +> It is often different than the "main page" of upstream Wikipedia. > Kiwix Main page needs to be passed in the next step, so until there is an automated way to determine "main page" of ZIM, you need to open ZIM in Kiwix reader and eyeball the name of the landing page. ### Step 5: Convert the unpacked zim directory to a website with mirror info @@ -242,7 +242,7 @@ Make sure at least two full reliable copies exist before updating DNSLink. ## mirrorzim.sh -It is possible to automate steps 3-6 via a wrapper script named `mirrorzim.sh`. +It is possible to automate steps 3-6 via a wrapper script named `mirrorzim.sh`. 
It will download the latest snapshot of specified language (if needed), unpack it, and add it to IPFS. To see how the script behaves try running it on one of the smallest wikis, such as `cu`: @@ -253,22 +253,26 @@ $ ./mirrorzim.sh --languagecode=cu --wikitype=wikipedia --hostingdnsdomain=cu.wi ## Docker build -A `Dockerfile` with all the software requirements is provided. -For now it is only a handy container for running the process on non-Linux -systems or if you don't want to pollute your system with all the dependencies. -In the future it will be end-to-end blackbox that takes ZIM and spits out CID -and repo. +A `Dockerfile` with all the software requirements is provided. +It is a handy container for running the process on non-Linux systems, if you don't want to pollute your system with all the dependencies or if you want to run the process in the cloud. +It is an end-to-end blackbox that takes mirrorzim.sh arguments, spits out CID and runs IPFS daemon. -To build the docker image: +To run the publicly available docker image: ```sh -docker build . -t distributed-wikipedia-mirror-build +docker run --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror:latest ``` -To use it as a development environment: +Alternatively, to build the docker image: ```sh -docker run -it -v $(pwd):/root/distributed-wikipedia-mirror --net=host --entrypoint bash distributed-wikipedia-mirror-build +docker build . 
--platform=linux/amd64 -f Dockerfile -t distributed-wikipedia-mirror +``` + +And then, to run it: + +```sh +docker run --ulimit nofile=65536:65536 -p 4001:4001/tcp -p 4001:4001/udp distributed-wikipedia-mirror ``` # How to Help diff --git a/mirrorzim.sh b/mirrorzim.sh index a80c6da..299a26e 100755 --- a/mirrorzim.sh +++ b/mirrorzim.sh @@ -11,19 +11,23 @@ usage() { echo "" echo "SYNOPSIS" echo " $0 --languagecode= --wikitype=" + echo " [--tag=]" + echo " [--edition=]" echo " [--hostingdnsdomain=]" echo " [--hostingipnshash=]" echo " [--mainpageversion=]" echo "" echo "OPTIONS" echo "" - echo " -l, --languagecode string - the language of the wikimedia property e.g. tr - turkish, en - english" - echo " -w, --wikitype string - the type of the wikimedia property e.g. wikipedia, wikiquote" - echo " -d, --hostingdnsdomain string - the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org" - echo " -i, --hostingipnshash string - the IPNS hash the mirror will be hosted at e.g. QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W" - echo " -v, --mainpageversion string - an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id" - - exit 2 + echo " -l, --languagecode string - the language of the wikimedia property e.g. tr - turkish, en - english" + echo " -w, --wikitype string - the type of the wikimedia property e.g. wikipedia, wikiquote" + echo " -t, --tag string - the tag of the wikimedia property e.g. all, top (defaults to all)" + echo " -e, --edition string - the edition of the wikimedia property e.g. maxi, mini (defaults to maxi)" + echo " -c, --date string - the date of the wikimedia property e.g. latest (defaults to latest)" + echo " -d, --hostingdnsdomain string - the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org" + echo " -i, --hostingipnshash string - the IPNS hash the mirror will be hosted at e.g. 
QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W" + echo " -v, --mainpageversion string - an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id" + exit 2 } @@ -38,6 +42,18 @@ case $i in WIKI_TYPE="${i#*=}" shift ;; + -t=*|--tag=*) + TAG="${i#*=}" + shift + ;; + -e=*|--edition=*) + EDITION="${i#*=}" + shift + ;; + -c=*|--date=*) + DATE="${i#*=}" + shift + ;; -d=*|--hostingdnsdomain=*) HOSTING_DNS_DOMAIN="${i#*=}" shift @@ -70,6 +86,18 @@ if [ -z ${WIKI_TYPE+x} ]; then usage fi +if [ -z ${TAG+x} ]; then + TAG="all" +fi + +if [ -z ${EDITION+x} ]; then + EDITION="maxi" +fi + +if [ -z ${DATE+x} ]; then + DATE="latest" +fi + if [ -z ${HOSTING_DNS_DOMAIN+x} ]; then HOSTING_DNS_DOMAIN="" fi @@ -87,7 +115,7 @@ PATH=$PATH:$(realpath ./bin) which zimdump &> /dev/null || (curl --progress-bar -L https://download.openzim.org/release/zim-tools/zim-tools_linux-x86_64-3.0.0.tar.gz | tar -xvz --strip-components=1 -C ./bin zim-tools_linux-x86_64-3.0.0/zimdump && chmod +x ./bin/zimdump) printf "\nDownload and verify the zim file...\n" -ZIM_FILE_SOURCE_URL="$(./tools/getzim.sh download $WIKI_TYPE $WIKI_TYPE $LANGUAGE_CODE all maxi latest | grep 'URL:' | cut -d' ' -f3)" +ZIM_FILE_SOURCE_URL="$(./tools/getzim.sh download $WIKI_TYPE $WIKI_TYPE $LANGUAGE_CODE $TAG $EDITION $DATE | grep 'URL:' | cut -d' ' -f3)" ZIM_FILE=$(echo $ZIM_FILE_SOURCE_URL | rev | cut -d'/' -f1 | rev) TMP_DIRECTORY="./tmp/$(echo $ZIM_FILE | cut -d'.' 
-f1)" @@ -116,11 +144,8 @@ node ./bin/run $TMP_DIRECTORY \ ${HOSTING_IPNS_HASH:+--hostingipnshash=$HOSTING_IPNS_HASH} \ ${MAIN_PAGE_VERSION:+--mainpageversion=$MAIN_PAGE_VERSION} -printf "\n-------------------------\n" -printf "\nIPFS_PATH=$IPFS_PATH\n" - printf "\nAdding the processed tmp directory to IPFS\n(this part may take long time on a slow disk):\n" -CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Qp $TMP_DIRECTORY) +CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Q -p $TMP_DIRECTORY) MFS_DIR="/${ZIM_FILE}__$(date +%F_%T)" # pin by adding to MFS under a meaningful name diff --git a/src/site-transforms.ts b/src/site-transforms.ts index 8dfdc2d..1f49a84 100644 --- a/src/site-transforms.ts +++ b/src/site-transforms.ts @@ -322,7 +322,7 @@ export const useKiwixLandingPage = async ( } // Fixup relative paths, if needed - const depth = (options.kiwixMainPage.match(/\//g) || []).length + const depth = (kiwixMainPageSrc.substring(wikiFolder.length + 1).match(/\//g) || []).length if (depth) { const fixRelativeLinksUp = (filePath: string, depth: number) => { const fileBytes = readFileSync(filePath) diff --git a/terraform/README.md b/terraform/README.md new file mode 100644 index 0000000..7b2dd04 --- /dev/null +++ b/terraform/README.md @@ -0,0 +1,50 @@ +This directory contains 2 terraform configurations: +- `ec2`: for creating an EC2 instance with IPFS ports exposed to public and docker installed +- `ecr`: for creating a public ECR repository which can be used to store `distributed-wikipedia-mirror` images + +The terraform configurations expect the following environment variables: +- `AWS_REGION` (ec2 only): the region to create the resources in +- `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`: the credentials to the account to create the resources in + +###### How to publish a new distributed wikipedia mirror on a new instance? + +1. Create a new instance. + ```bash + cd terraform/ec2 + terraform apply + ``` +1. 
SSH to the newly created instance. The exact command will be printed as an output of `terraform apply`. + ```bash + ssh -i ec2-user@ + ``` +1. Create a new distributed wikipedia mirror. + ```bash + docker run --name wikipedia-on-ipfs --ulimit nofile=65536:65536 -d -p 4001:4001/tcp -p 4001:4001/udp public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror:latest + ``` +1. Find the CID of the newly created distributed wikipedia mirror. It might take a while for it to become available. + ```bash + docker logs wikipedia-on-ipfs + ``` + +###### How to create a new ECR repository? + +It will print out a bunch of useful commands that should be updated in the docs. +```bash +cd terraform/ecr +terraform apply +``` + +###### How to create a new docker image? + +1. Log in to the ECR. + ```bash + aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror + ``` +1. Build a new docker image. + ```bash + docker build . --platform=linux/amd64 -f Dockerfile -t public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror -t public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror:$(date -u +%F) -t public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror:$(date -u +%s) + ``` +1. Push the newly created docker image. 
+ ```bash + docker push --all-tags public.ecr.aws/c4h1q7d1/distributed-wikipedia-mirror + ``` diff --git a/terraform/ec2/outputs.tf b/terraform/ec2/outputs.tf new file mode 100644 index 0000000..be3cf19 --- /dev/null +++ b/terraform/ec2/outputs.tf @@ -0,0 +1,3 @@ +output "ssh_command" { + value = "ssh -i ec2-user@${aws_instance.this.public_dns}" +} diff --git a/terraform/ec2/providers.tf b/terraform/ec2/providers.tf new file mode 100644 index 0000000..b21d3b6 --- /dev/null +++ b/terraform/ec2/providers.tf @@ -0,0 +1 @@ +provider "aws" {} diff --git a/terraform/ec2/resources.tf b/terraform/ec2/resources.tf new file mode 100644 index 0000000..02200ae --- /dev/null +++ b/terraform/ec2/resources.tf @@ -0,0 +1,99 @@ +data "aws_ami" "this" { + most_recent = true + + filter { + name = "name" + values = ["amzn2-ami-ecs-hvm-2.0.20220304-x86_64-ebs"] + } + + owners = ["591542846629"] +} + +resource "aws_security_group" "this" { + name = "wikipedia-on-ipfs" + + ingress { + description = "SSH Access" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + + ingress { + description = "TCP Transport" + from_port = 4001 + to_port = 4001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + + ingress { + description = "UDP Transport" + from_port = 4001 + to_port = 4001 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } +} + +resource "aws_key_pair" "this" { + key_name = "wikipedia-on-ipfs" + public_key = "${var.public_key}" + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } +} + +resource "aws_instance" "this" { + ami = data.aws_ami.this.id + # t3.small doesn't have enough 
memory + instance_type = "t3.medium" + key_name = "${aws_key_pair.this.key_name}" + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } + + root_block_device { + volume_size = var.volume_size + volume_type = "gp3" + iops = var.volume_iops + throughput = var.volume_throughput + + tags = { + Name = "wikipedia-on-ipfs" + Url = "https://github.com/ipfs/distributed-wikipedia-mirror" + } + } + + credit_specification { + cpu_credits = "standard" + } + + security_groups = ["${aws_security_group.this.name}"] + + user_data = join("\n", [ + "#!/bin/bash", + "sysctl -w net.core.rmem_max=2500000" + ]) +} diff --git a/terraform/ec2/terraform.tf b/terraform/ec2/terraform.tf new file mode 100644 index 0000000..4001c13 --- /dev/null +++ b/terraform/ec2/terraform.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + aws = { + version = "4.5.0" + } + } + + required_version = "~> 1.1.4" +} diff --git a/terraform/ec2/variables.tf b/terraform/ec2/variables.tf new file mode 100644 index 0000000..7d924de --- /dev/null +++ b/terraform/ec2/variables.tf @@ -0,0 +1,23 @@ +variable "public_key" { + description = "SSH public key." + type = string + sensitive = true +} + +variable "volume_size" { + description = "Root block device volume size." + type = number + default = 100 +} + +variable "volume_iops" { + description = "Root block device volume IOPS." + type = number + default = 3000 +} + +variable "volume_throughput" { + description = "Root block device volume throughput (MiB/s)." + type = number + default = 125 +} diff --git a/terraform/ecr/outputs.tf b/terraform/ecr/outputs.tf new file mode 100644 index 0000000..ed344a9 --- /dev/null +++ b/terraform/ecr/outputs.tf @@ -0,0 +1,15 @@ +output "docker_login_command" { + value = "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin ${aws_ecrpublic_repository.this.repository_uri}" +} + +output "docker_build_command" { + value = "docker build . 
--platform=linux/amd64 -f Dockerfile -t ${aws_ecrpublic_repository.this.repository_uri} -t ${aws_ecrpublic_repository.this.repository_uri}:$(date -u +%F) -t ${aws_ecrpublic_repository.this.repository_uri}:$(date -u +%s)" +} + +output "docker_push_command" { + value = "docker push --all-tags ${aws_ecrpublic_repository.this.repository_uri}" +} + +output "docker_run_command" { + value = "docker run --name wikipedia-on-ipfs --ulimit nofile=65536:65536 -d -p 4001:4001/tcp -p 4001:4001/udp ${aws_ecrpublic_repository.this.repository_uri}:latest " +} diff --git a/terraform/ecr/providers.tf b/terraform/ecr/providers.tf new file mode 100644 index 0000000..c125940 --- /dev/null +++ b/terraform/ecr/providers.tf @@ -0,0 +1,3 @@ +provider "aws" { + region = "us-east-1" +} diff --git a/terraform/ecr/resources.tf b/terraform/ecr/resources.tf new file mode 100644 index 0000000..58663b6 --- /dev/null +++ b/terraform/ecr/resources.tf @@ -0,0 +1,3 @@ +resource "aws_ecrpublic_repository" "this" { + repository_name = "distributed-wikipedia-mirror" +} diff --git a/terraform/ecr/terraform.tf b/terraform/ecr/terraform.tf new file mode 100644 index 0000000..4001c13 --- /dev/null +++ b/terraform/ecr/terraform.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + aws = { + version = "4.5.0" + } + } + + required_version = "~> 1.1.4" +} diff --git a/tools/entrypoint.sh b/tools/entrypoint.sh new file mode 100755 index 0000000..c0cf62d --- /dev/null +++ b/tools/entrypoint.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -euo pipefail + +ipfs init -p server,flatfs --empty-repo +ipfs config --json Experimental.AcceleratedDHTClient true +ipfs config --json 'Datastore.Spec.mounts' "$(ipfs config 'Datastore.Spec.mounts' | jq -c '.[0].child.sync=false')" + +# pipefail (set above) makes a mirrorzim.sh failure abort the script instead of being masked by the exit status of ts +./mirrorzim.sh "$@" | ts + +# exec replaces this shell with the daemon so it receives container stop signals directly +exec ipfs daemon diff --git a/tools/find_original_main_page_url.sh b/tools/find_original_main_page_url.sh deleted file mode 100755 index fe3ea17..0000000 --- a/tools/find_original_main_page_url.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash 
-# vim: set ts=2 sw=2: - -set -euo pipefail - -# Landing pages shipping with ZIM file are either truncated or Kiwix-specific. -# This script finds the URL of original version of the langing page -# mathing the timestamp of snapshot in unpacked ZIM directory - -usage() { - echo "USAGE:" - echo " $0
"; - echo "" - exit 2 -} - -if [ -z "${1-}" ]; then - echo "Missing main page name (eg. Main_Page.html) " - usage -fi - -if [ -z "${2-}" ]; then - echo "Missing unpacked zim dir (eg. ./out) " - usage -fi - -MAIN_PAGE=$1 -ZIM_ROOT=$2 - -SNAPSHOT_URL=$(grep -io 'https://[^"]*oldid=[^"]*' "$ZIM_ROOT/A/$MAIN_PAGE") - -echo $SNAPSHOT_URL