Commit a1199a5 ("Init", 0 parents)

15 files changed: 808 additions, 0 deletions

.dockerignore

Lines changed: 4 additions & 0 deletions

```
kubernetes/
Dockerfile
settings.yaml
README.md
```

.gitignore

Lines changed: 138 additions & 0 deletions
```
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
```

Dockerfile

Lines changed: 11 additions & 0 deletions
```dockerfile
FROM python:3.7

ENV PYTHONUNBUFFERED 1

COPY . /app
WORKDIR /app

RUN pip install --upgrade pip && \
    pip install -r requirements.txt

CMD ["python", "index.py"]
```
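For local testing, the image can be built and run directly with Docker. This is a sketch, not part of the commit: the `es-reporting:dev` tag is illustrative, and the volume mount mirrors how the Kubernetes manifests below inject `settings.yaml`.

```shell
# Build the image from the repository root (tag is illustrative)
docker build -t es-reporting:dev .

# Run a one-off indexing pass, mounting a local settings.yaml
# over the copy baked into the image
docker run --rm \
  -v "$(pwd)/settings.yaml:/app/settings.yaml" \
  es-reporting:dev \
  python index.py --days=1
```

This requires a running Docker daemon plus reachable PostgreSQL and Elasticsearch instances, so it is not self-verifying.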

README.md

Lines changed: 28 additions & 0 deletions
````markdown
## Data reporting for MyTardis

This script populates an initial Elasticsearch index for reporting from MyTardis datafile records. It can also add new data, or update existing data, for a given time period.

MyTardis version 4.2+ is supported.

### Technical details

Settings are read from the default `settings.yaml` config file.

You must specify database credentials and the location of the Elasticsearch server. You can also increase the number of rows fetched per single bulk call.

Run from the command line:

```
python index.py [--config CONFIG] [--days DAYS] [--rebuild]

optional arguments:
  --config CONFIG  Config file location.
  --days DAYS      Populate only the past DAYS of data; use -1 to index all data.
  --rebuild        Delete and re-create the index.
```

### Docker and Kubernetes

The latest version of the Docker image is built automatically and published on DockerHub as mytardis/es-reporting:latest.

Sample files in the [kubernetes](./kubernetes/) folder provide an example of running this tool in Kubernetes.
````
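The README references a `settings.yaml` config file; the keys it needs can be read off `index.py` in this same commit. A minimal example for a local setup (hostnames and credentials are placeholders):

```yaml
database:
  host: localhost
  port: 5432
  username: user
  password: pass
  database: mytardis

elasticsearch:
  host: localhost
  port: 9200

index:
  name: reporting
  limit: 10000    # rows fetched per single bulk call
```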

index.py

Lines changed: 123 additions & 0 deletions
```python
import sys
import argparse
import os
import yaml
import json

from psycopg2 import connect
from psycopg2.extras import RealDictCursor
from elasticsearch import Elasticsearch, helpers

from reporting import count_from_db, data_from_db, get_extras, data_to_es


def init_es_index(index_name):

    with open("{}.json".format(index_name)) as f:
        config = json.load(f)

    es.indices.delete(
        index=index_name,
        ignore_unavailable=True
    )

    es.indices.create(
        index=index_name,
        body=config
    )


parser = argparse.ArgumentParser()
parser.add_argument(
    "--config",
    default="settings.yaml",
    help="Config file location."
)
parser.add_argument(
    "--days",
    type=int,
    default=1,
    help="Populate past days of data."
)
parser.add_argument(
    "--rebuild",
    action="store_true",
    help="Delete and create index."
)

args = parser.parse_args()

if os.path.isfile(args.config):
    with open(args.config) as f:
        settings = yaml.load(f, Loader=yaml.Loader)
else:
    sys.exit("Can't find settings.")

try:
    con = connect(
        host=settings["database"]["host"],
        port=settings["database"]["port"],
        user=settings["database"]["username"],
        password=settings["database"]["password"],
        database=settings["database"]["database"]
    )
except Exception:
    sys.exit("Can't connect to the database.")

try:
    es_host = "{}:{}".format(
        settings["elasticsearch"]["host"],
        settings["elasticsearch"]["port"]
    )
    es = Elasticsearch([es_host])
except Exception:
    con.close()
    sys.exit("Can't connect to Elasticsearch.")

cur = con.cursor(cursor_factory=RealDictCursor)

if args.rebuild:
    print("Rebuilding index.")
    init_es_index(settings["index"]["name"])

start = 0
to_go = 1
cache = {}

while to_go > 0:

    to_go = count_from_db(cur, args.days, start)
    print(
        "{:,} datafileobjects to index, {:,} datasets cached"
        .format(to_go, len(cache))
    )

    if to_go > 0:

        rows = data_from_db(cur, args.days, start, settings["index"]["limit"])

        # Fetch dataset-level extras for any datasets not cached yet
        dataset_ids = list(set([row["dataset_id"] for row in rows]))
        extra_ds_ids = []
        for ds_id in dataset_ids:
            if ds_id not in cache:
                extra_ds_ids.append(ds_id)
        if len(extra_ds_ids) != 0:
            extras = get_extras(cur, extra_ds_ids)
            for k in extras:
                cache[k] = extras[k]

        # Advance the keyset cursor to the highest dfo_id seen,
        # merging cached dataset extras into each row
        data = []
        for row in rows:
            if row["dfo_id"] > start:
                start = row["dfo_id"]
            ds_id = row["dataset_id"]
            if ds_id in cache:
                data.append({**row, **cache[ds_id]})
            else:
                data.append(row)

        helpers.bulk(es, data_to_es(settings["index"]["name"], data))

print("Completed.")
cur.close()
con.close()
```
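The `reporting` module imported by `index.py` (`count_from_db`, `data_from_db`, `get_extras`, `data_to_es`) is among the 15 files of this commit but not shown in this section of the diff. As a rough illustration of the contract `helpers.bulk` expects, `data_to_es` presumably yields one action dict per row, along these lines (everything beyond the `dfo_id` field is an assumption):

```python
def data_to_es(index_name, data):
    """Yield one Elasticsearch bulk action per row (sketch).

    helpers.bulk(es, actions) accepts any iterable of dicts; using the
    datafileobject id as _id makes re-running the indexer idempotent.
    """
    for row in data:
        yield {
            "_index": index_name,
            "_id": row["dfo_id"],   # assumed unique per datafileobject
            "_source": row,
        }


# Small self-contained check with fake rows
rows = [{"dfo_id": 1, "size": 100}, {"dfo_id": 2, "size": 250}]
actions = list(data_to_es("reporting", rows))
```

Because the generator is lazy, `helpers.bulk` can stream arbitrarily many rows without materialising all actions in memory.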

kubernetes/README.md

Lines changed: 37 additions & 0 deletions
````markdown
## Kubernetes deployment

1. Start by changing the ConfigMap values in the `configmap.yaml` file to match your setup (namespaces, names, credentials).

   Deploy the config map:

   `kubectl create -f configmap.yaml`

2. Run the initial job to build the index and populate data.

   `kubectl create -f job-create.yaml`

   It runs the script with two optional arguments: `--days=-1` to index all data, and `--rebuild` to create the index in Elasticsearch.

3. Schedule a cron job to update only the last 24 hours of data.

   `kubectl create -f cronjob-update.yaml`

### Kibana

You can deploy Kibana:

`kubectl create -f kibana.yaml`

and expose it to the public using an (nginx) Ingress with username/password authentication.

First, generate an auth file and load it as a secret:

```
htpasswd -c auth.txt mytardis
kubectl -n mytardis create secret generic reporting --from-file=auth.txt
rm auth.txt
```

Second, deploy the Ingress with an annotation to use the secret:

`kubectl create -f ingress.yaml`
````
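The `ingress.yaml` referenced above is not shown in this section of the diff. With the nginx ingress controller, wiring the `reporting` secret into basic auth is done with annotations along these lines (the host, service name, and port are placeholders):

```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: reporting
  namespace: mytardis
  annotations:
    nginx.ingress.kubernetes.io/auth-type: basic
    nginx.ingress.kubernetes.io/auth-secret: reporting
    nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
spec:
  rules:
    - host: reporting.example.com   # placeholder
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: kibana      # assumed Kibana service name
                port:
                  number: 5601
```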

kubernetes/configmap.yaml

Lines changed: 21 additions & 0 deletions
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: es-reporting
  namespace: mytardis
data:
  settings.yaml: |
    database:
      host: pgbouncer.postgres.svc.cluster.local
      port: 5432
      username: user
      password: pass
      database: postgres

    elasticsearch:
      host: elasticsearch.mytardis.svc.cluster.local
      port: 9200

    index:
      name: reporting
      limit: 10000
```

kubernetes/cronjob-update.yaml

Lines changed: 28 additions & 0 deletions
```yaml
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: es-reporting-update
  namespace: mytardis
spec:
  schedule: "15 0 * * *"
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: go
              image: mytardis/es-reporting:latest
              imagePullPolicy: Always
              command:
                - python
                - index.py
                - --days=1
              volumeMounts:
                - name: settings
                  mountPath: /app/settings.yaml
                  subPath: settings.yaml
          volumes:
            - name: settings
              configMap:
                name: es-reporting
```
