Commit b838205

James Robinson authored
ENG-5517 special behavior for databricks. (#84)
1 parent e0445d5 commit b838205

File tree

3 files changed: +249 lines, -3 lines

noteable_magics/datasource_postprocessing.py

Lines changed: 45 additions & 0 deletions
@@ -1,4 +1,5 @@
 import os
+import shutil
 from base64 import b64decode
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -260,3 +261,47 @@ def postprocess_awsathena(
 
     # 3. quote_plus s3_staging_dir
     create_engine_kwargs['s3_staging_dir'] = quote_plus(create_engine_kwargs['s3_staging_dir'])
+
+
+@register_postprocessor('databricks+connector')
+def postprocess_databricks(
+    datasource_id: str, dsn_dict: Dict[str, str], create_engine_kwargs: Dict[str, Any]
+) -> None:
+    """ENG-5517: If cluster_id is present, and `databricks-connect` is in the path, then
+    set up and run it.
+
+    Also be sure to purge cluster_id, org_id, and port from create_engine_kwargs, since these
+    fields were added only to feed this side effect."""
+
+    cluster_id_key = 'cluster_id'
+    connect_file_opt_keys = [cluster_id_key, 'org_id', 'port']
+
+    # Collect data to drive databricks-connect if we've got a cluster_id and the script is in $PATH.
+    connect_args = create_engine_kwargs['connect_args']
+    if cluster_id_key in connect_args and shutil.which('databricks-connect'):
+        # host and token (actually, our password field) come from dsn_dict.
+        # (And what databricks-connect wants as 'host' is actually an https:// URL. Sigh.)
+        args = {
+            'host': f'https://{dsn_dict["host"]}/',
+            'token': dsn_dict['password'],
+        }
+        for key in connect_file_opt_keys:
+            if key in connect_args:
+                args[key] = connect_args[key]
+
+        connect_file_path = Path(os.environ['HOME']) / '.databricks-connect'
+
+        # rm -f any preexisting file.
+        if connect_file_path.exists():
+            connect_file_path.unlink()
+
+        # Now let the external databricks-connect command (re)build it and do whatever
+        # else it does. See ENG-5517. The 'y' at the start accepts the license agreement.
+        # (We've fallen oh so far from Don Libes' tcl Expect for stuff like this.)
+        pipeline = f"echo y {args['host']} {args['token']} {args[cluster_id_key]} {args['org_id']} {args['port']} | databricks-connect configure"
+        os.system(pipeline)
+
+    # Always be sure to purge these only-for-databricks-connect file args from create_engine_kwargs,
+    # even if not all were present.
+    for key in connect_file_opt_keys:
+        create_engine_kwargs.pop(key, '')
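For illustration, a minimal sketch of driving this postprocessor directly. The datasource id, host, token, and cluster values below are hypothetical placeholders; note that if databricks-connect actually is on $PATH, this shells out to it for real:

from noteable_magics.datasource_postprocessing import postprocess_databricks

# Hypothetical inputs, shaped like what bootstrap_datasource() passes in.
dsn_dict = {'host': 'dbc-example.cloud.databricks.com', 'password': 'dapi-example-token'}
create_engine_kwargs = {
    'connect_args': {
        'http_path': 'sql/protocolv1/o/0/0000-000000-example',
        'cluster_id': '0122-044839-example',
        'org_id': 1234567890,
        'port': 15001,
    }
}

# Side effect: if databricks-connect is on $PATH, ~/.databricks-connect is
# rebuilt via `databricks-connect configure`. Either way, the
# databricks-connect-only keys are popped from create_engine_kwargs afterwards.
postprocess_databricks('hypothetical-datasource-id', dsn_dict, create_engine_kwargs)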

noteable_magics/datasources.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ def bootstrap_datasource(
 
     create_engine_kwargs = {'connect_args': connect_args}
 
-    # 'drivername' comes in via metadata, because reasons.
+    # 'drivername', which is really of the form dialect(+drivername), comes in via metadata, because reasons.
     drivername = metadata['drivername']
     dsn_dict['drivername'] = drivername
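As an aside, that dialect(+drivername) form is standard SQLAlchemy URL syntax, so the two halves can be pulled apart mechanically. A small sketch, assuming SQLAlchemy 1.4+ and using placeholder credentials:

from sqlalchemy.engine import make_url

# 'databricks+connector' splits into dialect 'databricks' and driver 'connector'.
url = make_url('databricks+connector://token:dapi-example@dbc-example.cloud.databricks.com')
print(url.get_backend_name())  # databricks
print(url.get_driver_name())   # connector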

tests/test_datasources.py

Lines changed: 203 additions & 2 deletions
@@ -1,6 +1,7 @@
 """ Tests over datasource bootstrapping """
 
 import json
+import os
 from pathlib import Path
 from typing import Callable, List
 from uuid import uuid4
@@ -341,7 +342,6 @@ def test_bootstrap_datasources(self, datasource_id_factory, tmp_path: Path):
 class TestBootstrapDatasource:
     @pytest.mark.parametrize('sample_name', SampleData.all_sample_names())
     def test_success(self, sample_name, datasource_id):
-
         # Clear out connections at the onset, else the get_engine() portion
         # gets confused over human-name conflicts when we've bootstrapped
         # the same sample data repeatedly, namely through having first
@@ -547,6 +547,208 @@ def test_had_bad_sqlite_database_files(self, datasource_id, bad_pathname: str):
         # the real problem to the user.
 
 
+class TestDatabricks:
+    """Test the ENG-5517 Very Special Behavior for databricks side effects when the
+    databricks-connect script is in PATH and we have the optional 'cluster_id'
+    datapoint in connect args.
+    """
+
+    @pytest.fixture()
+    def tmp_home(self, tmpdir: Path) -> Path:
+        existing_home = os.environ['HOME']
+
+        new_home = tmpdir / 'home'
+        new_home.mkdir()
+
+        os.environ['HOME'] = str(new_home)
+
+        try:
+            yield new_home
+        finally:
+            os.environ['HOME'] = existing_home
+
+    @pytest.fixture()
+    def databricks_connect_in_path(self, tmpdir: Path) -> Path:
+        # Get a mock-ish executable 'databricks-connect' into an element of the path
+        # so that which('databricks-connect') will find something (see the databricks
+        # postprocessor).
+
+        # Make a new subdir of tmpdir, add it to the path, and create an executable
+        # shell script databricks-connect.
+        bindir = tmpdir / 'scratch-bin'
+        bindir.mkdir()
+
+        orig_path = os.environ['PATH']
+        os.environ['PATH'] = f"{orig_path}:{bindir}"
+
+        scriptpath = bindir / 'databricks-connect'
+        script_output_path = tmpdir / 'connect-inputs.txt'
+
+        # Now make a 'databricks-connect' executable that echoes all its stdin to tmpdir/connect-inputs.txt.
+        with open(scriptpath, 'w') as outfile:
+            outfile.write(f'#!/bin/sh\ncat > {script_output_path}\nexit 0\n')
+
+        scriptpath.chmod(0o755)
+
+        try:
+            # Yield the script output path so a test can inspect its contents.
+            yield script_output_path
+        finally:
+            # Undo the $PATH change.
+            os.environ['PATH'] = orig_path
+
+    @pytest.fixture()
+    def jsons_for_extra_behavior(self):
+        """Return a DatasourceJSONs describing databricks that will tickle postprocess_databricks()
+        into doing its extra behavior. Also return a dict of some of the fields within that JSON."""
+
+        hostname = 'dbc-1bab80fc-a74b.cloud.databricks.com'
+        password = 'dapie372d57cefdc078d8ce3936fcb0e22ee'
+        port = 54321
+        org_id = 65475674534576
+        cluster_id = '0122-044839-vx2fk606'
+
+        case_data = DatasourceJSONs(
+            meta_dict={
+                'required_python_modules': ['sqlalchemy-databricks'],
+                'allow_datasource_dialect_autoinstall': False,
+                'drivername': 'databricks+connector',
+                'sqlmagic_autocommit': False,  # This one is special!
+                'name': 'Databricks With Extras',
+            },
+            dsn_dict={
+                'username': 'token',
+                'host': hostname,
+                'password': password,
+            },
+            connect_args_dict={
+                "http_path": "sql/protocolv1/o/2414094324684936/0125-220758-m9pfb4c7",
+                "cluster_id": cluster_id,
+                "org_id": org_id,
+                "port": port,
+            },
+        )
+
+        return (
+            case_data,
+            {
+                'hostname': hostname,
+                'password': password,
+                'port': port,
+                'org_id': org_id,
+                'cluster_id': cluster_id,
+            },
+        )
+
+    def test_extra_behavior(
+        self, datasource_id, databricks_connect_in_path, tmp_home, jsons_for_extra_behavior
+    ):
+        """Test creating databricks with extra keys to cause postprocess_databricks() to do its magic"""
+
+        # Make a preexisting tmp_home/.databricks-connect; expect it to get unlinked
+        # (see postprocess_databricks).
+        dotconnect = tmp_home / '.databricks-connect'
+        with dotconnect.open('w') as of:
+            of.write('exists')
+
+        assert dotconnect.exists()
+
+        case_data, case_dict = jsons_for_extra_behavior
+
+        assert 'cluster_id' in case_data.connect_args_dict
+
+        initial_len = len(Connection.connections)
+
+        datasources.bootstrap_datasource(
+            datasource_id, case_data.meta_json, case_data.dsn_json, case_data.connect_args_json
+        )
+
+        assert len(Connection.connections) == initial_len + 1
+
+        # The preexisting file should have been unlinked. The real databricks-connect
+        # script would have recreated it, but the mock version we create in fixture
+        # databricks_connect_in_path creates a different file.
+        assert not dotconnect.exists()
+
+        # databricks_connect_in_path is the path where the fake script output was placed.
+        assert databricks_connect_in_path.exists()
+
+        # Expect to find things in it. See ENG-5517.
+        # We can only test that we ran the mock script and check its known output.
+        # What the real one does ... ?
+        contents = databricks_connect_in_path.read().split()
+        assert len(contents) == 6
+        assert contents[0] == 'y'
+        assert contents[1] == f"https://{case_dict['hostname']}/"
+        assert contents[2] == case_dict['password']
+        assert contents[3] == case_dict['cluster_id']
+        assert contents[4] == str(case_dict['org_id'])
+        assert contents[5] == str(case_dict['port'])
+
+    def test_skip_extra_behavior_if_no_databricks_connect(
+        self, datasource_id, tmp_home, jsons_for_extra_behavior
+    ):
+        # Create a $HOME/.databricks-connect file. It should remain untouched,
+        # since we're not also using fixture databricks_connect_in_path to put the
+        # script in our path.
+
+        dotconnect = tmp_home / '.databricks-connect'
+        with dotconnect.open('w') as of:
+            of.write('preexists')
+
+        assert dotconnect.exists()
+
+        case_data, case_dict = jsons_for_extra_behavior
+
+        initial_len = len(Connection.connections)
+
+        # Should not fail, but won't have done any extra behavior.
+        datasources.bootstrap_datasource(
+            datasource_id, case_data.meta_json, case_data.dsn_json, case_data.connect_args_json
+        )
+
+        assert len(Connection.connections) == initial_len + 1
+
+        # Left unchanged.
+        assert dotconnect.exists()
+        assert 'preexists' in dotconnect.read()
+
+    def test_skip_extra_behavior_if_no_cluster_id(
+        self, datasource_id, tmp_home, databricks_connect_in_path
+    ):
+        # Create a $HOME/.databricks-connect file. It should remain untouched,
+        # since the sample databricks datasource carries no cluster_id, even though
+        # the databricks-connect script is in our path.
+
+        dotconnect = tmp_home / '.databricks-connect'
+        with dotconnect.open('w') as of:
+            of.write('preexists')
+
+        assert dotconnect.exists()
+
+        case_data = SampleData.get_sample('databricks')
+        assert 'cluster_id' not in case_data.connect_args_dict
+
+        initial_len = len(Connection.connections)
+
+        # Should not have triggered the extra behavior -- cluster_id wasn't present (but
+        # we do have a databricks-connect script in the PATH).
+        datasources.bootstrap_datasource(
+            datasource_id, case_data.meta_json, case_data.dsn_json, case_data.connect_args_json
+        )
+
+        assert len(Connection.connections) == initial_len + 1
+
+        # But won't have breathed on the dotconnect file.
+        # Left unchanged.
+        assert dotconnect.exists()
+        assert 'preexists' in dotconnect.read()
+
+
 class TestEnsureRequirements:
     def test_already_installed(self, datasource_id):
         requirements = ['pip']
@@ -569,7 +771,6 @@ def test_raises_when_disallowed_but_needs_to_install(
 
 class TestInstallPackage:
     def test_install(self, not_installed_package):
-
        pkgname = not_installed_package
 
        # Better not be installed right now!
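A design note on the fixtures above: tmp_home and databricks_connect_in_path save and restore os.environ by hand with try/finally. pytest's built-in monkeypatch fixture would get the same automatic restore at teardown; a sketch of that alternative (not what this commit does):

import pytest

@pytest.fixture()
def tmp_home(tmpdir, monkeypatch):
    # monkeypatch.setenv records the old HOME and restores it automatically
    # at teardown, replacing the explicit try/finally bookkeeping.
    new_home = tmpdir / 'home'
    new_home.mkdir()
    monkeypatch.setenv('HOME', str(new_home))
    return new_home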
