Commit 003065b

Author: James Robinson

Test and fix the bugfixes. (#86)

* Test and fix the bugfixes.
* Better test over the input fed into databricks-connect, proving newlines and no preamble whitespace that a human would not have entered.

1 parent 89a5a9f · commit 003065b

2 files changed: +133, -17 lines

noteable_magics/datasource_postprocessing.py (24 additions, 6 deletions)

@@ -2,7 +2,7 @@
 import shutil
 from base64 import b64decode
 from pathlib import Path
-from subprocess import PIPE, Popen
+from subprocess import PIPE, Popen, TimeoutExpired
 from tempfile import NamedTemporaryFile
 from typing import Any, Callable, Dict
 from urllib.parse import quote_plus, urlparse
@@ -264,21 +264,27 @@ def postprocess_awsathena(
     create_engine_kwargs['s3_staging_dir'] = quote_plus(create_engine_kwargs['s3_staging_dir'])


+DATABRICKS_CONNECT_SCRIPT_TIMEOUT = 10  # seconds
+
+
 @register_postprocessor('databricks+connector')
 def postprocess_databricks(
     datasource_id: str, dsn_dict: Dict[str, str], create_engine_kwargs: Dict[str, Any]
 ) -> None:
     """ENG-5517: If cluster_id is present, and `databricks-connect` is in the path, then
     set up and run it.

-    Also be sure to purge cluster_id, org_id, port from create_engine_kwargs, in that these
-    fields were added for only going into this side effect."""
+    Also be sure to purge cluster_id, org_id, port from the connect_args portion of
+    create_engine_kwargs, since those fields were added only to feed this side effect."""

     cluster_id_key = 'cluster_id'
     connect_file_opt_keys = [cluster_id_key, 'org_id', 'port']

     # Collect data to drive databricks-connect if we've got a cluster_id and script is in $PATH.
     connect_args = create_engine_kwargs['connect_args']
+    # Only wanted for getting connect_args. Any additional dereferencing is a bug.
+    del create_engine_kwargs
+
     if cluster_id_key in connect_args and shutil.which('databricks-connect'):
         # host, token (actually, our password field) come from dsn_dict.
         # (and what databricks-connect wants as 'host' is actually an https:// URL. Sigh.)
@@ -297,16 +303,28 @@ def postprocess_databricks(
             connect_file_path.unlink()

         p = Popen(['databricks-connect', 'configure'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
-        _stdout, stderr = p.communicate(input=f"""y
+        try:
+            _stdout, stderr = p.communicate(
+                # Indentation is fugly so as to not prefix each input line with whitespace.
+                # And oh, be sure to have a newline between each input into the 'interactive' script.
+                input=f"""y
 {args['host']}
 {args['token']}
 {args[cluster_id_key]}
 {args['org_id']}
-{args['port']}""".encode(), timeout=10)
+{args['port']}""".encode(),
+                timeout=DATABRICKS_CONNECT_SCRIPT_TIMEOUT,
+            )
+        except TimeoutExpired:
+            raise ValueError(
+                f'databricks-connect took longer than {DATABRICKS_CONNECT_SCRIPT_TIMEOUT} seconds to complete.'
+            )

         if p.returncode != 0:
             # Failed to execute the script. Raise an exception.
-            raise ValueError("Failed to execute databricks-connect configure script: " + stderr)
+            raise ValueError(
+                "Failed to execute databricks-connect configure script: " + stderr.decode()
+            )

         # Always be sure to purge these only-for-databricks-connect file args from connect_args,
         # even if not all were present.
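
For context on the shape of the fix above: the pattern is to script an 'interactive' CLI by writing newline-separated answers (with no leading whitespace) to its stdin via Popen.communicate(), bounding the wait with a timeout and converting both TimeoutExpired and a nonzero exit status into a descriptive ValueError. The following is a minimal standalone sketch of that pattern, not the project's real code; the 'some-cli' command, SCRIPT_TIMEOUT value, and function name are all assumptions for illustration.

from subprocess import PIPE, Popen, TimeoutExpired

SCRIPT_TIMEOUT = 10  # seconds; assumed value for this sketch


def run_scripted_configure(answers):
    """Feed newline-separated answers to a hypothetical 'some-cli configure' prompt."""
    p = Popen(['some-cli', 'configure'], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    try:
        _stdout, stderr = p.communicate(
            # Join with bare newlines so no answer picks up leading whitespace.
            input='\n'.join(answers).encode(),
            timeout=SCRIPT_TIMEOUT,
        )
    except TimeoutExpired:
        p.kill()
        raise ValueError(f'some-cli took longer than {SCRIPT_TIMEOUT} seconds to complete.')

    if p.returncode != 0:
        # stderr comes back as bytes; decode it before building the message
        # (the same decode issue fixed in the diff above).
        raise ValueError('Failed to execute some-cli configure script: ' + stderr.decode())

Joining a list with '\n' sidesteps the left-flushed triple-quoted f-string used in the real function, but the stdin contract is otherwise the same.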

tests/test_datasources.py (109 additions, 11 deletions)

@@ -3,7 +3,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Callable, List
+from typing import Callable, List, Tuple
 from uuid import uuid4

 import pkg_resources
@@ -554,6 +554,7 @@ class TestDatabricks:

     @pytest.fixture()
     def tmp_home(self, tmpdir: Path) -> Path:
+        """Replace $HOME with a new directory under $TMPDIR, yielding the new Path."""
         existing_home = os.environ['HOME']

         new_home = tmpdir / 'home'
@@ -568,10 +569,13 @@ def tmp_home(self, tmpdir: Path) -> Path:
         os.environ['HOME'] = existing_home

     @pytest.fixture()
-    def databricks_connect_in_path(self, tmpdir: Path) -> Path:
-        # Get a mock-ish executable 'databricks-connect' into an element in the path
-        # so that which('databricks-connect') will find something (see databricks post
-        # processor)
+    def databricks_connect_in_path(self, tmpdir: Path) -> Tuple[Path, Path]:
+        """Get a mock-ish executable 'databricks-connect' into an element in the path
+        so that which('databricks-connect') will find something (see databricks post
+        processor).
+
+        Yields the new executable's path, plus where it will scribble its own output.
+        """

         # Make a new subdir of tmpdir, add it to the path, create executable
         # shell script databricks-connect
@@ -593,15 +597,14 @@ def databricks_connect_in_path(self, tmpdir: Path) -> Path:
         scriptpath.chmod(0o755)

         try:
-            # Yield the script output path so a test can inspect its contents.
-            yield script_output_path
+            yield scriptpath, script_output_path

         finally:
             # Undo $PATH change
             os.environ['PATH'] = orig_path

     @pytest.fixture()
-    def jsons_for_extra_behavior(self):
+    def jsons_for_extra_behavior(self) -> Tuple[DatasourceJSONs, dict]:
         """Return a DatasourceJSONs describing databricks that will tickle postprocess_databricks()
         into doing its extra behavior. Also returns dict of some of the fields within that JSON."""

@@ -643,6 +646,100 @@ def jsons_for_extra_behavior(self):
             },
         )

+    def test_postprocess_databricks_pops_correctly(self, datasource_id, jsons_for_extra_behavior):
+        """Ensure that the postprocess_databricks side effect pops from the correct dict
+        (connect_args, not the containing create_engine_kwargs dict), even w/o
+        databricks-connect being found in the $PATH.
+        """
+
+        keys_expected_to_be_removed = ['cluster_id', 'org_id', 'port']
+        jsons_obj, specific_fields = jsons_for_extra_behavior
+        connect_args = jsons_obj.connect_args_dict
+
+        # All initially there...
+        assert all(key in connect_args for key in keys_expected_to_be_removed)
+
+        create_engine_kwargs = {'connect_args': connect_args}
+
+        datasource_postprocessing.postprocess_databricks(
+            datasource_id,
+            jsons_obj.dsn_dict,
+            create_engine_kwargs,
+        )
+
+        # Should have removed all the keys as side effect of the call.
+        # (Had bug where they were popped from the wrong dict originally.)
+        assert not any(key in connect_args for key in keys_expected_to_be_removed)
+
+    def test_errors_from_databricks_connect_are_surfaced(
+        self, datasource_id, databricks_connect_in_path, tmp_home, jsons_for_extra_behavior
+    ):
+        """Prove that if the databricks-connect script exits nonzero, a ValueError is raised
+        and the script's stderr will be within the error message."""
+
+        # Respell the databricks-connect script to always error out; expect that error in a
+        # ValueError when calling postprocess_databricks.
+
+        script_path, _ = databricks_connect_in_path
+
+        expected_error_message = 'oh noes!'
+
+        # Respell the script to bomb out with a message to stderr.
+        with script_path.open('w') as of:
+            of.write('#!/bin/sh\n')
+            of.write(f'echo "{expected_error_message}" 1>&2\n')
+            of.write('exit 1\n')
+
+        jsons_obj, specific_fields = jsons_for_extra_behavior
+        create_engine_kwargs = {'connect_args': jsons_obj.connect_args_dict}
+
+        with pytest.raises(ValueError, match=expected_error_message):
+            datasource_postprocessing.postprocess_databricks(
+                datasource_id,
+                jsons_obj.dsn_dict,
+                create_engine_kwargs,
+            )
+
+    @pytest.fixture()
+    def short_script_timeout(self):
+        """Respell datasource_postprocessing.DATABRICKS_CONNECT_SCRIPT_TIMEOUT to 1 (second)."""
+        original_value = datasource_postprocessing.DATABRICKS_CONNECT_SCRIPT_TIMEOUT
+
+        datasource_postprocessing.DATABRICKS_CONNECT_SCRIPT_TIMEOUT = 1
+
+        try:
+            yield datasource_postprocessing.DATABRICKS_CONNECT_SCRIPT_TIMEOUT
+        finally:
+            datasource_postprocessing.DATABRICKS_CONNECT_SCRIPT_TIMEOUT = original_value
+
+    def test_databricks_connect_taking_too_long(
+        self, datasource_id, databricks_connect_in_path, short_script_timeout, jsons_for_extra_behavior
+    ):
+        """Prove that if databricks-connect takes longer than allowed to run, a ValueError will
+        be raised with an appropriate message.
+        """
+
+        # Respell the databricks-connect script to take longer than short_script_timeout seconds;
+        # expect that in a ValueError when calling postprocess_databricks.
+
+        script_path, _ = databricks_connect_in_path
+
+        # Respell the script to take longer than the new timeout, but to (try to) exit cleanly.
+        with script_path.open('w') as of:
+            of.write('#!/bin/sh\n')
+            of.write(f'sleep {short_script_timeout + 1}\n')
+            of.write('exit 0\n')
+
+        jsons_obj, specific_fields = jsons_for_extra_behavior
+        create_engine_kwargs = {'connect_args': jsons_obj.connect_args_dict}
+
+        with pytest.raises(ValueError, match='databricks-connect took longer than'):
+            datasource_postprocessing.postprocess_databricks(
+                datasource_id,
+                jsons_obj.dsn_dict,
+                create_engine_kwargs,
+            )
+
     def test_extra_behavior(
         self, datasource_id, databricks_connect_in_path, tmp_home, jsons_for_extra_behavior
     ):
@@ -673,13 +770,14 @@ def test_extra_behavior(
         # databricks_connect_in_path will create a different file.
         assert not dotconnect.exists()

-        # databricks_connect_in_path is the path where the fake script output was placed
-        assert databricks_connect_in_path.exists()
+        # databricks_connect_in_path's second member is where the fake script output was placed.
+        _, script_output = databricks_connect_in_path
+        assert script_output.exists()

        # Expect to find things in it. See ENG-5517.
        # We can only test that we ran this mock script and the known result
        # of our mock script. What the real one does ... ?
-        contents = databricks_connect_in_path.read().split()
+        contents = script_output.read().split('\n')
         assert len(contents) == 6
         assert contents[0] == 'y'
         assert contents[1] == f"https://{case_dict['hostname']}/"
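
The tests above lean on the databricks_connect_in_path fixture, whose body is only partially visible in this diff. Below is a rough, self-contained sketch of that kind of fixture, not the suite's exact code: the fixture name, file layout, and use of pytest's built-in tmp_path (rather than the suite's tmpdir) are assumptions. It shows the mechanics of planting a fake executable on $PATH and capturing what it is fed on stdin.

import os
import shutil
from pathlib import Path
from typing import Iterator, Tuple

import pytest


@pytest.fixture()
def fake_databricks_connect_in_path(tmp_path: Path) -> Iterator[Tuple[Path, Path]]:
    """Yield (script_path, script_output_path) for a fake 'databricks-connect' on $PATH."""
    bin_dir = tmp_path / 'bin'
    bin_dir.mkdir()
    script_output_path = tmp_path / 'databricks-connect-input.txt'

    # The fake script just records whatever it is fed on stdin, then exits cleanly.
    script_path = bin_dir / 'databricks-connect'
    script_path.write_text(f'#!/bin/sh\ncat > "{script_output_path}"\nexit 0\n')
    script_path.chmod(0o755)

    orig_path = os.environ['PATH']
    os.environ['PATH'] = f'{bin_dir}{os.pathsep}{orig_path}'
    try:
        # Sanity check: the postprocessor's which('databricks-connect') lookup will now succeed.
        assert shutil.which('databricks-connect')
        yield script_path, script_output_path
    finally:
        # Undo the $PATH change so other tests are unaffected.
        os.environ['PATH'] = orig_path

A test can then overwrite script_path to simulate failure modes (a nonzero exit, or sleeping past the timeout), which is exactly what the two new error-path tests do.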
