Skip to content

Commit 351befb

Browse files
hail interoperability example (#381)
* add init script, job config, nb-test, notebooks, docs for hail interop Signed-off-by: wbrandler <william.brandler@databricks.com> * update hail notebook Signed-off-by: wbrandler <william.brandler@databricks.com> Signed-off-by: William Brandler <William.Brandler@databricks.com> * fix hail notebook title Signed-off-by: William Brandler <William.Brandler@databricks.com> * remove source notebook and regenerate with gen-nb-src.py Signed-off-by: William Brandler <William.Brandler@databricks.com> * provide mapping for jobs config files Signed-off-by: William Brandler <William.Brandler@databricks.com> * define params for notebook mapping function Signed-off-by: William Brandler <William.Brandler@databricks.com> * define params for notebook mapping function Signed-off-by: William Brandler <William.Brandler@databricks.com> * remove boilerplate, change function Signed-off-by: William Brandler <William.Brandler@databricks.com> * define param Signed-off-by: William Brandler <William.Brandler@databricks.com>
1 parent f4a0817 commit 351befb

File tree

7 files changed

+179
-4
lines changed

7 files changed

+179
-4
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
#!/bin/bash
# Cluster init script: install Hail on a Databricks cluster and wire its
# Spark jar + driver configuration into the runtime.
set -ex

# Pick up user-provided environment variables, specifically HAIL_VERSION
source /databricks/spark/conf/spark-env.sh

# ${HAIL_VERSION:?...} aborts with a clear message if the cluster was not
# configured with the HAIL_VERSION spark_env_var.
/databricks/python/bin/pip install -U "hail==${HAIL_VERSION:?HAIL_VERSION must be set in spark_env_vars}"

# Locate the Hail Spark jar installed by pip and expose it to the JVM.
# -print -quit stops after the first match so cp gets exactly one source path.
hail_jar_path=$(find /databricks/python3 -name 'hail-all-spark.jar' -print -quit)
if [[ -z "$hail_jar_path" ]]; then
  echo "hail-all-spark.jar not found under /databricks/python3" >&2
  exit 1
fi
cp "$hail_jar_path" /databricks/jars

# Note: This configuration takes precedence since configurations are
# applied in reverse-lexicographic order.
cat <<HERE >/databricks/driver/conf/00-hail.conf
[driver] {
  "spark.kryo.registrator" = "is.hail.kryo.HailKryoRegistrator"
  "spark.hadoop.fs.s3a.connection.maximum" = 5000
  "spark.serializer" = "org.apache.spark.serializer.KryoSerializer"
}
HERE

echo $?

docs/dev/jobs-config-hail.json

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"new_cluster": {
3+
"spark_version": "8.2.x-scala2.12",
4+
"node_type_id": "Standard_DS3_v2",
5+
"num_workers": 2,
6+
"spark_env_vars": {
7+
"HAIL_VERSION": "0.2.65"
8+
},
9+
"init_scripts": [
10+
{
11+
"dbfs": {
12+
"destination": "dbfs:/glow-init-scripts/install-hail.sh"
13+
}
14+
}
15+
]
16+
},
17+
"libraries": [
18+
{
19+
"maven": {
20+
"coordinates": "io.projectglow:glow-spark3_2.12:1.0.1"
21+
}
22+
},
23+
{
24+
"pypi": {
25+
"package": "glow.py==1.0.1"
26+
}
27+
}
28+
],
29+
"timeout_seconds": 3600
30+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"etl/hail-interoperation": "docs/dev/jobs-config-hail.json"
3+
}

docs/dev/run-nb-test.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,27 @@
1616
import time
1717
import uuid
1818

19-
JOBS_JSON = 'docs/dev/jobs-config.json'
19+
NOTEBOOK_JOBS_JSON_MAPPING = 'docs/dev/notebook-jobs-config-mapping.json'
2020
INIT_SCRIPT_DIR = 'docs/dev/init-scripts'
2121

2222

2323
def run_cli_cmd(cli_profile, api, args):
    """Invoke the Databricks CLI and return its captured stdout.

    :param cli_profile: Databricks CLI profile name
    :param api: CLI API group to invoke (e.g. ``workspace``, ``runs``)
    :param args: additional CLI arguments
    :raises ValueError: if the CLI exits with a nonzero status; the
        :class:`subprocess.CompletedProcess` is attached for inspection
    :return: raw stdout bytes of the CLI invocation
    """
    full_cmd = ['databricks', '--profile', cli_profile, api]
    full_cmd.extend(args)
    result = subprocess.run(full_cmd, capture_output=True)
    if result.returncode:
        raise ValueError(result)
    return result.stdout
2929

30+
def get_jobs_config(d, key, jobs_path="docs/dev/jobs-config.json"):
    """Resolve the jobs configuration JSON path for a notebook.

    Notebooks with an entry in the mapping use their custom jobs config;
    all others fall back to the default ``jobs_path``.

    :param d: dictionary with mapping of notebooks to databricks jobs configuration (from NOTEBOOK_JOBS_JSON_MAPPING)
    :param key: notebook (nb) name
    :param jobs_path: path to default jobs configuration to test notebooks
    :return: path to the jobs configuration JSON to use for this notebook
    """
    # dict.get keeps the default when the notebook has no custom mapping
    jobs_path = d.get(key, jobs_path)
    print("running notebook " + key + " with the following jobs configuration json " + jobs_path)
    return jobs_path
3040

3141
@click.command()
3242
@click.option('--cli-profile', default='DEFAULT', help='Databricks CLI profile name.')
@@ -39,8 +49,8 @@ def run_cli_cmd(cli_profile, api, args):
3949
def main(cli_profile, workspace_tmp_dir, dbfs_init_script_dir, source_dir, nbs):
4050
identifier = str(uuid.uuid4())
4151
work_dir = os.path.join(workspace_tmp_dir, identifier)
42-
with open(JOBS_JSON, 'r') as f:
43-
jobs_json = json.load(f)
52+
with open(NOTEBOOK_JOBS_JSON_MAPPING, 'r') as f:
53+
notebook_jobs_json_mapping = json.load(f)
4454

4555
if not nbs:
4656
nbs = [os.path.relpath(path, source_dir).split('.')[0]
@@ -58,6 +68,9 @@ def main(cli_profile, workspace_tmp_dir, dbfs_init_script_dir, source_dir, nbs):
5868

5969
print(f"Launching runs")
6070
for nb in nbs:
71+
jobs_json_path = get_jobs_config(notebook_jobs_json_mapping, nb)
72+
with open(jobs_json_path, 'r') as f:
73+
jobs_json = json.load(f)
6174
jobs_json['name'] = 'Glow notebook integration test - ' + nb
6275
jobs_json['notebook_task'] = {'notebook_path': work_dir + '/' + nb}
6376
run_submit = run_cli_cmd(cli_profile, 'runs', ['submit', '--json', json.dumps(jobs_json)])

docs/source/_static/notebooks/etl/hail-interoperation.html

Lines changed: 43 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Databricks notebook source
# Demonstrates Hail <-> Glow interoperability: import a VCF with Hail,
# convert the MatrixTable to a Spark DataFrame with the Glow schema,
# round-trip it through Delta, then clean up.
# NOTE(review): relies on Databricks-injected globals (sc, spark, display,
# dbutils); not runnable outside a Databricks notebook.
import hail as hl
# idempotent=True lets this cell be re-run without re-initializing Hail
hl.init(sc, idempotent=True, quiet=True)

import glow
spark = glow.register(spark)
from glow.hail import functions

from pyspark.sql.functions import *

# COMMAND ----------

# Input VCF from the hosted sample dataset; output Delta table in DBFS tmp
vcf_path = '/databricks-datasets/hail/data-001/1kg_sample.vcf.bgz'
out_path = 'dbfs:/tmp/1kg_sample.delta'

# COMMAND ----------

# Load the VCF into a Hail MatrixTable and preview it
vcf_mt = hl.import_vcf(vcf_path)
vcf_mt.show()

# COMMAND ----------

vcf_mt.count()

# COMMAND ----------

# MAGIC %md
# MAGIC ##### convert to spark dataframe with glow schema

# COMMAND ----------

# include_sample_ids=True keeps per-sample IDs in the genotypes column
df = functions.from_matrix_table(vcf_mt, include_sample_ids=True)

# COMMAND ----------

df.printSchema()

# COMMAND ----------

# Persist the Glow-schema DataFrame as a Delta table
df.write.format("delta").save(out_path)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### read back in, view and count dataframe

# COMMAND ----------

df2 = spark.read.format("delta").load(out_path)

# COMMAND ----------

display(df2)

# COMMAND ----------

df2.count()

# COMMAND ----------

# Remove the temporary Delta table written above
dbutils.fs.rm(out_path, recurse=True)

docs/source/etl/hail.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,6 @@ All of the other Glow DataFrame genotype fields are derived from the Hail Matrix
115115
- ``AD``
116116
* - ``<ANY_FIELD>``
117117
- ``<ANY_FIELD>``
118+
119+
.. notebook:: .. etl/hail-interoperation.html
120+
:title: Hail interoperation notebook

0 commit comments

Comments
 (0)