Commit 274f013

feat: add support for sparksql magic (#137)
1 parent 30e22bb commit 274f013

File tree

5 files changed, +178 -2 lines changed

DEVELOPING.md
README.md
google/cloud/dataproc_spark_connect/session.py
requirements-dev.txt
tests/integration/test_session.py

DEVELOPING.md

Lines changed: 22 additions & 0 deletions
@@ -39,6 +39,28 @@ env \
 pytest --tb=auto -v
 ```
 
+## Testing with Magic Support
+
+To run tests with magic functionality, install the required dependencies manually:
+
+```sh
+pip install .
+pip install IPython sparksql-magic
+```
+
+Then run tests as normal. Any magic-related tests will automatically detect and use the available dependencies.
+
+## Testing without Magic Support
+
+To run tests without the magic dependencies, simply install the base package:
+
+```sh
+pip install .
+pytest
+```
+
+Tests that require magic functionality will be automatically skipped if the dependencies are not available.
+
 The integration tests in particular can take a while to run. To speed up the
 testing cycle, you can run them in parallel. You can do so using the `xdist`
 plugin by setting the `-n` flag to the number of parallel runners you want to
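
The automatic skipping described in the DEVELOPING.md addition relies on pytest's `importorskip` helper, the same call the new integration tests in this commit use. A minimal sketch of the pattern; the test name and body below are illustrative, not part of the commit:

```python
# Illustrative sketch of the skip pattern used by the magic-related tests.
# The test name and body are hypothetical; see tests/integration/test_session.py below.
import pytest


def test_something_magic_related():
    # Skip (rather than fail) this test when the optional packages are missing.
    pytest.importorskip("IPython")
    pytest.importorskip("sparksql_magic")

    # ...magic-dependent assertions would go here...
```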

README.md

Lines changed: 47 additions & 0 deletions
@@ -54,6 +54,53 @@ environment variables:
 spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
 ```
 
+### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+**Installation**: To use magic commands, install the required dependencies manually:
+```bash
+pip install dataproc-spark-connect
+pip install IPython sparksql-magic
+```
+
+1. Load the magic extension:
+```python
+%load_ext sparksql_magic
+```
+
+2. Configure default settings (optional):
+```python
+%config SparkSql.limit=20
+```
+
+3. Execute SQL queries:
+```python
+%%sparksql
+SELECT * FROM your_table
+```
+
+4. Advanced usage with options:
+```python
+# Cache results and create a view
+%%sparksql --cache --view result_view df
+SELECT * FROM your_table WHERE condition = true
+```
+
+Available options:
+- `--cache` / `-c`: Cache the DataFrame
+- `--eager` / `-e`: Cache with eager loading
+- `--view VIEW` / `-v VIEW`: Create a temporary view
+- `--limit N` / `-l N`: Override the default row display limit
+- `variable_name`: Store the result in a variable
+
+See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+**Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+```bash
+pip install dataproc-spark-connect
+```
+
 ## Developing
 
 For development instructions see [guide](DEVELOPING.md).
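
Taken together, the README steps amount to the following notebook flow. This is a hedged sketch, not part of the diff: the import path is assumed from the package layout in this commit (google/cloud/dataproc_spark_connect), and the query is a stand-in for `your_table`:

```python
# Sketch of the end-to-end notebook flow described above (not part of this diff).
# Import path assumed from the package layout in this commit.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# getOrCreate() now also registers the session as the active PySpark
# SparkSession (see the session.py change below), which is what the magic picks up.
spark = DataprocSparkSession.builder.getOrCreate()

# Subsequent notebook cells:
#
#   %load_ext sparksql_magic
#
#   %%sparksql --limit 10 result_df
#   SELECT 1 AS id, 'Dataproc' AS product
#
# result_df then holds the query result as a DataFrame in the notebook namespace.
```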

google/cloud/dataproc_spark_connect/session.py

Lines changed: 21 additions & 0 deletions
@@ -559,6 +559,13 @@ def getOrCreate(self) -> "DataprocSparkSession":
             session = self._get_exiting_active_session()
             if session is None:
                 session = self.__create()
+
+            # Register this session as the instantiated SparkSession for compatibility
+            # with tools and libraries that expect SparkSession._instantiatedSession
+            from pyspark.sql import SparkSession as PySparkSQLSession
+
+            PySparkSQLSession._instantiatedSession = session
+
             return session
 
     def _handle_custom_session_id(self):
@@ -1162,6 +1169,20 @@ def stop(self) -> None:
                 )
 
             self._remove_stopped_session_from_file()
+
+            # Clean up SparkSession._instantiatedSession if it points to this session
+            try:
+                from pyspark.sql import SparkSession as PySparkSQLSession
+
+                if PySparkSQLSession._instantiatedSession is self:
+                    PySparkSQLSession._instantiatedSession = None
+                    logger.debug(
+                        "Cleared SparkSession._instantiatedSession reference"
+                    )
+            except (ImportError, AttributeError):
+                # PySpark not available or _instantiatedSession doesn't exist
+                pass
+
             DataprocSparkSession._active_s8s_session_uuid = None
             DataprocSparkSession._active_s8s_session_id = None
             DataprocSparkSession._active_session_uses_custom_id = False
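
The registration added in `getOrCreate()` means consumers that look up the plain PySpark session can now find the Dataproc-backed one. An illustrative sketch of that lookup; this is not sparksql-magic's actual source, and the helper name is hypothetical:

```python
# Illustrative only: how a tool that expects SparkSession._instantiatedSession
# can discover the Dataproc-backed session after getOrCreate() has run.
from pyspark.sql import SparkSession


def find_active_session():
    # Set by the registration added above; cleared again in stop().
    session = SparkSession._instantiatedSession
    if session is None:
        raise RuntimeError("No active Spark session found")
    return session
```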

requirements-dev.txt

Lines changed: 2 additions & 0 deletions
@@ -1,9 +1,11 @@
 google-api-core>=2.19
 google-cloud-dataproc>=5.18
 ipython~=9.1
+ipywidgets>=8.0.0
 packaging>=20.0
 pyink~=24.0
 pyspark[connect]~=4.0.0
 setuptools>=72.0
+sparksql-magic>=0.0.3
 tqdm>=4.67
 websockets>=14.0

tests/integration/test_session.py

Lines changed: 86 additions & 2 deletions
@@ -79,7 +79,7 @@ def os_environment(auth_type, image_version, test_project, test_region):
     )
     os.environ["DATAPROC_SPARK_CONNECT_AUTH_TYPE"] = auth_type
     if auth_type == "END_USER_CREDENTIALS":
-        os.environ.pop("DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT")
+        os.environ.pop("DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT", None)
     # Add SSL certificate fix
     os.environ["SSL_CERT_FILE"] = certifi.where()
     os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
@@ -113,7 +113,11 @@ def session_template_controller_client(test_client_options):
 
 @pytest.fixture
 def connect_session(test_project, test_region, os_environment):
-    return DataprocSparkSession.builder.getOrCreate()
+    return (
+        DataprocSparkSession.builder.projectId(test_project)
+        .location(test_region)
+        .getOrCreate()
+    )
 
 
 @pytest.fixture
@@ -537,3 +541,83 @@ def test_session_id_validation_in_integration(
 
     # Should not raise an exception
     assert builder._custom_session_id == valid_id
+
+
+@pytest.mark.parametrize("auth_type", ["END_USER_CREDENTIALS"], indirect=True)
+def test_sparksql_magic_library_available(connect_session):
+    """Test that the sparksql-magic library can be imported and loaded."""
+    pytest.importorskip(
+        "IPython", reason="IPython not available (install with magic extra)"
+    )
+    pytest.importorskip(
+        "sparksql_magic",
+        reason="sparksql-magic not available (install with magic extra)",
+    )
+
+    from IPython.terminal.interactiveshell import TerminalInteractiveShell
+
+    # Create a real IPython shell
+    shell = TerminalInteractiveShell.instance()
+    shell.user_ns = {"spark": connect_session}
+
+    # Test that sparksql_magic can be loaded (this verifies the dependency works)
+    try:
+        shell.run_line_magic("load_ext", "sparksql_magic")
+        magic_loaded = True
+    except Exception as e:
+        magic_loaded = False
+        print(f"Failed to load sparksql_magic: {e}")
+
+    assert magic_loaded, "sparksql_magic should be available as a dependency"
+
+    # Test that DataprocSparkSession can execute SQL (ensuring basic compatibility)
+    result = connect_session.sql("SELECT 'integration_test' as test_column")
+    data = result.collect()
+    assert len(data) == 1
+    assert data[0]["test_column"] == "integration_test"
+
+
+@pytest.mark.parametrize("auth_type", ["END_USER_CREDENTIALS"], indirect=True)
+def test_sparksql_magic_with_dataproc_session(connect_session):
+    """Test that sparksql-magic works with a registered DataprocSparkSession."""
+    pytest.importorskip(
+        "IPython", reason="IPython not available (install with magic extra)"
+    )
+    pytest.importorskip(
+        "sparksql_magic",
+        reason="sparksql-magic not available (install with magic extra)",
+    )
+
+    from IPython.terminal.interactiveshell import TerminalInteractiveShell
+
+    # Create a real IPython shell (DataprocSparkSession is already registered globally)
+    shell = TerminalInteractiveShell.instance()
+
+    # Load the sparksql_magic extension
+    shell.run_line_magic("load_ext", "sparksql_magic")
+
+    # Run the sparksql cell magic and capture the result into result_df
+    shell.run_cell_magic(
+        "sparksql",
+        "result_df",
+        """
+        SELECT
+            10 * 5 as multiplication,
+            SQRT(16) as square_root,
+            CONCAT('Dataproc', '-', 'Spark') as joined_string
+        """,
+    )
+
+    # Verify the result is captured in the namespace
+    assert "result_df" in shell.user_ns
+    df = shell.user_ns["result_df"]
+    assert df is not None
+
+    # Verify the computed values
+    data = df.collect()
+    assert len(data) == 1
+    row = data[0]
+
+    assert row["multiplication"] == 50
+    assert row["square_root"] == 4.0
+    assert row["joined_string"] == "Dataproc-Spark"
