Skip to content

Commit 351befb

Browse files
hail interoperability example (#381)
* add init script, job config, nb-test, notebooks, docs for hail interop Signed-off-by: wbrandler <william.brandler@databricks.com> * update hail notebook Signed-off-by: wbrandler <william.brandler@databricks.com> Signed-off-by: William Brandler <William.Brandler@databricks.com> * fix hail notebook title Signed-off-by: William Brandler <William.Brandler@databricks.com> * remove source notebook and regenerate with gen-nb-src.py Signed-off-by: William Brandler <William.Brandler@databricks.com> * provide mapping for jobs config files Signed-off-by: William Brandler <William.Brandler@databricks.com> * define params for notebook mapping function Signed-off-by: William Brandler <William.Brandler@databricks.com> * define params for notebook mapping function Signed-off-by: William Brandler <William.Brandler@databricks.com> * remove boilerplate, change function Signed-off-by: William Brandler <William.Brandler@databricks.com> * define param Signed-off-by: William Brandler <William.Brandler@databricks.com>
1 parent f4a0817 commit 351befb

File tree

7 files changed

+179
-4
lines changed

7 files changed

+179
-4
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
#!/bin/bash
# Cluster init script: install Hail on a Databricks cluster and wire its
# Spark jar + driver configuration into the runtime.
set -ex

# Pick up user-provided environment variables, specifically HAIL_VERSION
source /databricks/spark/conf/spark-env.sh

# ${HAIL_VERSION:?...} aborts with a clear message if the cluster was not
# configured with the HAIL_VERSION spark_env_var.
/databricks/python/bin/pip install -U "hail==${HAIL_VERSION:?HAIL_VERSION must be set in spark_env_vars}"

# Locate the Hail Spark jar installed by pip and expose it to the JVM.
# -print -quit stops after the first match so cp gets exactly one source path.
hail_jar_path=$(find /databricks/python3 -name 'hail-all-spark.jar' -print -quit)
if [[ -z "$hail_jar_path" ]]; then
  echo "hail-all-spark.jar not found under /databricks/python3" >&2
  exit 1
fi
cp "$hail_jar_path" /databricks/jars

# Note: This configuration takes precedence since configurations are
# applied in reverse-lexicographic order.
cat <<HERE >/databricks/driver/conf/00-hail.conf
[driver] {
  "spark.kryo.registrator" = "is.hail.kryo.HailKryoRegistrator"
  "spark.hadoop.fs.s3a.connection.maximum" = 5000
  "spark.serializer" = "org.apache.spark.serializer.KryoSerializer"
}
HERE

echo $?

docs/dev/jobs-config-hail.json

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"new_cluster": {
3+
"spark_version": "8.2.x-scala2.12",
4+
"node_type_id": "Standard_DS3_v2",
5+
"num_workers": 2,
6+
"spark_env_vars": {
7+
"HAIL_VERSION": "0.2.65"
8+
},
9+
"init_scripts": [
10+
{
11+
"dbfs": {
12+
"destination": "dbfs:/glow-init-scripts/install-hail.sh"
13+
}
14+
}
15+
]
16+
},
17+
"libraries": [
18+
{
19+
"maven": {
20+
"coordinates": "io.projectglow:glow-spark3_2.12:1.0.1"
21+
}
22+
},
23+
{
24+
"pypi": {
25+
"package": "glow.py==1.0.1"
26+
}
27+
}
28+
],
29+
"timeout_seconds": 3600
30+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"etl/hail-interoperation": "docs/dev/jobs-config-hail.json"
3+
}

docs/dev/run-nb-test.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,27 @@
1616
import time
1717
import uuid
1818

19-
JOBS_JSON = 'docs/dev/jobs-config.json'
19+
NOTEBOOK_JOBS_JSON_MAPPING = 'docs/dev/notebook-jobs-config-mapping.json'
2020
INIT_SCRIPT_DIR = 'docs/dev/init-scripts'
2121

2222

2323
def run_cli_cmd(cli_profile, api, args):
    """Invoke the Databricks CLI and return its captured stdout.

    :param cli_profile: Databricks CLI profile name
    :param api: CLI API group to invoke (e.g. ``workspace``, ``runs``)
    :param args: additional CLI arguments
    :raises ValueError: if the CLI exits with a nonzero status; the
        :class:`subprocess.CompletedProcess` is attached for inspection
    :return: raw stdout bytes of the CLI invocation
    """
    full_cmd = ['databricks', '--profile', cli_profile, api]
    full_cmd.extend(args)
    result = subprocess.run(full_cmd, capture_output=True)
    if result.returncode:
        raise ValueError(result)
    return result.stdout
2929

30+
def get_jobs_config(d, key, jobs_path="docs/dev/jobs-config.json"):
    """Resolve the jobs configuration JSON path for a notebook.

    Notebooks with an entry in the mapping use their custom jobs config;
    all others fall back to the default ``jobs_path``.

    :param d: dictionary with mapping of notebooks to databricks jobs configuration (from NOTEBOOK_JOBS_JSON_MAPPING)
    :param key: notebook (nb) name
    :param jobs_path: path to default jobs configuration to test notebooks
    :return: path to the jobs configuration JSON to use for this notebook
    """
    # dict.get keeps the default when the notebook has no custom mapping
    jobs_path = d.get(key, jobs_path)
    print("running notebook " + key + " with the following jobs configuration json " + jobs_path)
    return jobs_path
3040

3141
@click.command()
3242
@click.option('--cli-profile', default='DEFAULT', help='Databricks CLI profile name.')
@@ -39,8 +49,8 @@ def run_cli_cmd(cli_profile, api, args):
3949
def main(cli_profile, workspace_tmp_dir, dbfs_init_script_dir, source_dir, nbs):
4050
identifier = str(uuid.uuid4())
4151
work_dir = os.path.join(workspace_tmp_dir, identifier)
42-
with open(JOBS_JSON, 'r') as f:
43-
jobs_json = json.load(f)
52+
with open(NOTEBOOK_JOBS_JSON_MAPPING, 'r') as f:
53+
notebook_jobs_json_mapping = json.load(f)
4454

4555
if not nbs:
4656
nbs = [os.path.relpath(path, source_dir).split('.')[0]
@@ -58,6 +68,9 @@ def main(cli_profile, workspace_tmp_dir, dbfs_init_script_dir, source_dir, nbs):
5868

5969
print(f"Launching runs")
6070
for nb in nbs:
71+
jobs_json_path = get_jobs_config(notebook_jobs_json_mapping, nb)
72+
with open(jobs_json_path, 'r') as f:
73+
jobs_json = json.load(f)
6174
jobs_json['name'] = 'Glow notebook integration test - ' + nb
6275
jobs_json['notebook_task'] = {'notebook_path': work_dir + '/' + nb}
6376
run_submit = run_cli_cmd(cli_profile, 'runs', ['submit', '--json', json.dumps(jobs_json)])

docs/source/_static/notebooks/etl/hail-interoperation.html

Lines changed: 43 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Databricks notebook source
# Demonstrates Hail <-> Glow interoperability: import a VCF with Hail,
# convert the MatrixTable to a Spark DataFrame with the Glow schema,
# round-trip it through Delta, then clean up.
# NOTE(review): relies on Databricks-injected globals (sc, spark, display,
# dbutils); not runnable outside a Databricks notebook.
import hail as hl
# idempotent=True lets this cell be re-run without re-initializing Hail
hl.init(sc, idempotent=True, quiet=True)

import glow
spark = glow.register(spark)
from glow.hail import functions

from pyspark.sql.functions import *

# COMMAND ----------

# Input VCF from the hosted sample dataset; output Delta table in DBFS tmp
vcf_path = '/databricks-datasets/hail/data-001/1kg_sample.vcf.bgz'
out_path = 'dbfs:/tmp/1kg_sample.delta'

# COMMAND ----------

# Load the VCF into a Hail MatrixTable and preview it
vcf_mt = hl.import_vcf(vcf_path)
vcf_mt.show()

# COMMAND ----------

vcf_mt.count()

# COMMAND ----------

# MAGIC %md
# MAGIC ##### convert to spark dataframe with glow schema

# COMMAND ----------

# include_sample_ids=True keeps per-sample IDs in the genotypes column
df = functions.from_matrix_table(vcf_mt, include_sample_ids=True)

# COMMAND ----------

df.printSchema()

# COMMAND ----------

# Persist the Glow-schema DataFrame as a Delta table
df.write.format("delta").save(out_path)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### read back in, view and count dataframe

# COMMAND ----------

df2 = spark.read.format("delta").load(out_path)

# COMMAND ----------

display(df2)

# COMMAND ----------

df2.count()

# COMMAND ----------

# Remove the temporary Delta table written above
dbutils.fs.rm(out_path, recurse=True)

docs/source/etl/hail.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,6 @@ All of the other Glow DataFrame genotype fields are derived from the Hail Matrix
115115
- ``AD``
116116
* - ``<ANY_FIELD>``
117117
- ``<ANY_FIELD>``
118+
119+
.. notebook:: .. etl/hail-interoperation.html
120+
:title: Hail interoperation notebook

0 commit comments

Comments
 (0)