diff --git a/.github/workflows/toolforge-check-lag.yml b/.github/workflows/toolforge-check-lag.yml index 34016c9..8efb306 100644 --- a/.github/workflows/toolforge-check-lag.yml +++ b/.github/workflows/toolforge-check-lag.yml @@ -12,7 +12,7 @@ jobs: strategy: max-parallel: 4 matrix: - tool: ["editgroups"] + tool: ["editgroups", "editgroups-commons"] steps: - name: Configure SSH key diff --git a/.github/workflows/toolforge-deploy.yml b/.github/workflows/toolforge-deploy.yml index 4296296..840c9c3 100644 --- a/.github/workflows/toolforge-deploy.yml +++ b/.github/workflows/toolforge-deploy.yml @@ -11,7 +11,7 @@ jobs: strategy: max-parallel: 4 matrix: - tool: ["editgroups"] + tool: ["editgroups", "editgroups-commons"] steps: - name: Configure SSH key diff --git a/deployment/celery-editgroups-commons.yaml b/deployment/celery-editgroups-commons.yaml new file mode 100644 index 0000000..d7f4ec3 --- /dev/null +++ b/deployment/celery-editgroups-commons.yaml @@ -0,0 +1,35 @@ +# Run Celery on kubernetes +apiVersion: apps/v1 +kind: Deployment +metadata: + name: editgroups-commons.celery.sh + namespace: tool-editgroups-commons + labels: + name: editgroups-commons.celery.sh + toolforge: tool +spec: + replicas: 1 + selector: + matchLabels: + name: editgroups-commons.celery.sh + toolforge: tool + template: + metadata: + labels: + name: editgroups-commons.celery.sh + toolforge: tool + spec: + containers: + - name: celery + image: docker-registry.tools.wmflabs.org/toolforge-python311-sssd-base:latest + command: [ "/data/project/editgroups-commons/www/python/src/tasks-commons.sh" ] + workingDir: /data/project/editgroups-commons/www/python/src + env: + - name: HOME + value: /data/project/editgroups-commons + imagePullPolicy: Always + resources: + requests: + memory: "512Mi" + limits: + memory: "1024Mi" diff --git a/docs/architecture.rst b/docs/architecture.rst index 5601bb9..8dc59ca 100644 --- a/docs/architecture.rst +++ b/docs/architecture.rst @@ -33,9 +33,9 @@ This is a fairly reliable endpoint which also lets us resume the stream from a r ingestion stopped for some reason. By default, the listener tries to resume listening from the date of the latest edit it has ingested. -This process can be invoked directly as a script:: +This process can be invoked directly as a Django management command:: - python listener.py + python3 manage.py listener It can be run as an `attached daemon to uwsgi `_. diff --git a/docs/install.rst b/docs/install.rst index 7f6c61b..ae169d7 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -33,7 +33,7 @@ start a local web server, you can then access your EditGroups instance at ``http By default, there will not be much to see as the database will be empty. To get some data in, you need to run the listener script, which reads the Wikidata event stream and populates the database:: - python listener.py + python3 manage.py listener You will also need to run Celery, which will periodically annotate edits which need inspection, as well as providing the undo functionality (if you have set up OAuth, see below):: @@ -130,7 +130,7 @@ Put the following content in ``~/www/python/uwsgi.ini``:: static-map = /static=/data/project/editgroups/www/python/src/static master = true - attach-daemon = /data/project/editgroups/www/python/venv/bin/python3 /data/project/editgroups/www/python/src/listener.py + attach-daemon = /data/project/editgroups/www/python/venv/bin/python3 /data/project/editgroups/www/python/src/manage.py listener and run ``./manage.py collectstatic`` in the ``~/www/python/src`` directory. The listener will be an attached dameon, restarting with webservice restart. diff --git a/listener.py b/listener.py deleted file mode 100755 index aa22919..0000000 --- a/listener.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -import os -import sys -from datetime import datetime -from datetime import timedelta - -""" -Amount of time to look back when restarting -the listener. This helps make sure that we don't -lose any edit when the listener is restarted. -""" -LOOKBEHIND_OFFSET = timedelta(minutes=5) - -if __name__ == '__main__': - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "editgroups.settings") - import django - django.setup() - - from store.stream import WikiEditStream - from store.utils import grouper - from store.models import Edit - - print('Listening to edits...') - s = WikiEditStream() - utcnow = datetime.utcnow() - try: - latest_edit_seen = Edit.objects.order_by('-timestamp')[0].timestamp - fetch_from = latest_edit_seen - LOOKBEHIND_OFFSET - except IndexError: - fetch_from = None - print('Starting from offset %s' % fetch_from.isoformat() if fetch_from else 'now') - - for i, batch in enumerate(grouper(s.stream(fetch_from), 50)): - if i % 50 == 0: - print('batch %d' % i) - print(datetime.fromtimestamp(batch[0].get('timestamp'))) - sys.stdout.flush() - Edit.ingest_edits(batch) - - print('End of stream') - diff --git a/restart_celery.sh b/restart_celery.sh index a69accf..2946764 100755 --- a/restart_celery.sh +++ b/restart_celery.sh @@ -1,6 +1,15 @@ #!/bin/bash export GOMAXPROCS=1 -kubectl delete deployment editgroups.celery.sh ; +TOOLNAME=$(whoami | cut -d "." -f 2) +echo "Running as ${TOOLNAME}" + +if [[ "$TOOLNAME" == "editgroups" ]]; then + CELERY_FN=celery +else + CELERY_FN="celery-${TOOLNAME}" +fi + +kubectl delete deployment "${TOOLNAME}.celery.sh"; echo "Waiting for Celery to stop"; -sleep 45 ; -kubectl create -f /data/project/editgroups/www/python/src/deployment/celery.yaml && kubectl get pods +sleep 45; +kubectl create -f "/data/project/${TOOLNAME}/www/python/src/deployment/${CELERY_FN}.yaml" diff --git a/store/management/commands/listener.py b/store/management/commands/listener.py new file mode 100755 index 0000000..c542162 --- /dev/null +++ b/store/management/commands/listener.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +import sys +from datetime import datetime +from datetime import timedelta + +from django.core.management.base import BaseCommand + +from store.stream import WikiEditStream +from store.utils import grouper +from store.models import Edit + + +class Command(BaseCommand): + """ + Amount of time to look back when restarting + the listener. This helps make sure that we don't + lose any edit when the listener is restarted. + """ + + LOOKBEHIND_OFFSET = timedelta(minutes=5) + help = "Listens to edits with EventStream" + + def handle(self, *args, **options): + print("Listening to edits...") + s = WikiEditStream() + try: + latest_edit_seen = Edit.objects.order_by("-timestamp")[0].timestamp + fetch_from = latest_edit_seen - self.LOOKBEHIND_OFFSET + except IndexError: + fetch_from = None + offset = fetch_from.isoformat() if fetch_from else "now" + print("Starting from offset %s" % offset) + + for i, batch in enumerate(grouper(s.stream(fetch_from), 50)): + if i % 50 == 0: + print("batch %d" % i) + print(datetime.fromtimestamp(batch[0].get("timestamp"))) + sys.stdout.flush() + Edit.ingest_edits(batch) + + print("End of stream") diff --git a/tasks-commons.sh b/tasks-commons.sh new file mode 100755 index 0000000..de2a1f5 --- /dev/null +++ b/tasks-commons.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + + +VENV_DIR=/data/project/editgroups-commons/www/python/venv + +if [[ -f ${VENV_DIR}/bin/activate ]]; then + source ${VENV_DIR}/bin/activate +else + echo "Creating virtualenv" + rm -rf ${VENV_DIR} + pyvenv ${VENV_DIR} + source ${VENV_DIR}/bin/activate + echo "Installing requirements" + pip install -r requirements.txt +fi; +echo "Starting celery" +export C_FORCE_ROOT=True +/data/project/editgroups-commons/www/python/venv/bin/python3 /data/project/editgroups-commons/www/python/venv/bin/celery --app=editgroups-commons.celery:app worker -l INFO -B --concurrency=3 --max-memory-per-child=50000 +echo $? +echo "Celery done" +