Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions deepnote_toolkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
(".set_notebook_path", "set_notebook_path"),
(".sql.sql_execution", "execute_sql"),
(".sql.sql_execution", "execute_sql_with_connection_json"),
(".sql.sql_query_chaining", "register_sql_query"),
(".variable_explorer", "deepnote_export_df"),
(".variable_explorer", "deepnote_get_data_preview_json"),
(".variable_explorer", "get_var_list"),
Expand Down
57 changes: 49 additions & 8 deletions deepnote_toolkit/sql/sql_query_chaining.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,30 @@
from deepnote_toolkit.sql.query_preview import DeepnoteQueryPreview
from deepnote_toolkit.sql.sql_utils import is_single_select_query

# Maps a SQL block's result variable name to the source query that produced it.
# deepnote-internal calls register_sql_query after a dataframe-mode SQL block
# runs, so that subsequent SQL blocks can reference the variable as a table and
# have it expanded into a CTE. This is the same chaining DeepnoteQueryPreview
# enables for query-preview-mode blocks, but without changing the result type
# (which would show a preview banner and limit the row count).
_sql_query_registry: dict = {}


def register_sql_query(variable_name, query):
"""Register a SQL block's source query so downstream blocks can chain on it.

Only single SELECT statements can be expanded into a CTE, so anything else
(or an empty query) clears any previous entry for the variable rather than
storing it.
"""
if variable_name is None:
return

if query is not None and is_single_select_query(query):
_sql_query_registry[variable_name] = query
else:
_sql_query_registry.pop(variable_name, None)
Comment on lines +14 to +30
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win

Add a concrete type contract to the new registry API.

_sql_query_registry: dict and register_sql_query(variable_name, query) currently hide the actual Optional[str] -> None contract. Please annotate the registry and this public function explicitly so callers and static checks enforce the same rules as the implementation.

As per coding guidelines, "Always use Optional[T] for parameters that can be None (not T = None)" and "Use explicit type hints for function parameters and return values".

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@deepnote_toolkit/sql/sql_query_chaining.py` around lines 14 - 30, Annotate
the registry and function with explicit types: add "from typing import Optional,
Dict" and change "_sql_query_registry: dict = {}" to "_sql_query_registry:
Dict[str, str] = {}". Update the function signature for register_sql_query to
"def register_sql_query(variable_name: Optional[str], query: Optional[str]) ->
None:" (keeping the implementation and the is_single_select_query call
unchanged) so static checks enforce the Optional[str] inputs and a None return
type.



def add_limit_clause(query: str, limit: int = 100):
class ExecuteSqlError(Exception):
Expand Down Expand Up @@ -137,16 +161,33 @@ def find_query_preview_references(

# Check each table reference
for table_reference in table_references:
# Check if the reference exists in the main module
variable_name = table_reference

# Skip references we've already resolved (dedupe + cycle guard by name)
if variable_name in query_preview_references:
continue

# 1) Dataframe-mode chaining. deepnote-internal registers a block's
# source query keyed by its result variable name. We only trust the
# entry while the variable is still bound in __main__, so a deleted or
# rebound variable doesn't produce a stale CTE expansion.
if variable_name in _sql_query_registry and hasattr(__main__, variable_name):
registered_source = _sql_query_registry[variable_name]
if registered_source:
query_preview_references[variable_name] = registered_source
# Recursively resolve the registered query's own references
find_query_preview_references(
registered_source,
query_preview_references,
processed_queries,
)
continue
Comment on lines +170 to +184
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

Rebinding the same name still expands stale SQL.

This branch only checks that the name still exists in __main__. If df_1 is rebound to a different object, the old registry entry is still used, so unchaining can execute SQL that no longer matches the notebook state. The registry needs a freshness signal beyond the variable name, or rebinding must clear the entry. Please add a rebinding regression test with the fix.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@deepnote_toolkit/sql/sql_query_chaining.py` around lines 170 - 184, The code
uses _sql_query_registry keyed only by variable_name so rebinding the same name
leaves a stale expansion; change the registry check in
find_query_preview_references to verify the registered entry is still the same
object by comparing a freshness signal (e.g., store and compare the original
object's id or a weakref when the registry is written) before trusting
registered_source, and if the current object's id/weakref no longer matches,
drop the registry entry instead of using it; update the registry-writing logic
wherever entries are created to include the freshness signal (and/or clear
entries on reassignment) and add a regression test that binds df_1 to an object
producing SQL, then rebinds df_1 to a different object and asserts the old SQL
is not expanded.


# 2) Query-preview-mode chaining. The variable itself is a
# DeepnoteQueryPreview that carries its source query.
if hasattr(__main__, table_reference):
variable_name = table_reference
variable = getattr(__main__, table_reference)
# If it's a QueryPreview object and not already in our list
# Use any() with a generator expression to check if the variable is already in the list
# This avoids using the pandas object in a boolean context
if isinstance(variable, DeepnoteQueryPreview) and not any(
id(variable) == id(ref) for ref in query_preview_references
):
if isinstance(variable, DeepnoteQueryPreview):
# Add it to our list
query_preview_source = variable._deepnote_query
query_preview_references[variable_name] = query_preview_source
Expand Down
80 changes: 80 additions & 0 deletions tests/unit/test_sql_query_chaining.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from unittest import TestCase

import __main__
import sqlparse
from sqlparse.tokens import Keyword

from deepnote_toolkit.sql import sql_query_chaining
from deepnote_toolkit.sql.sql_query_chaining import (
add_limit_clause,
extract_table_reference_from_token,
extract_table_references,
find_query_preview_references,
register_sql_query,
unchain_sql_query,
)

Expand Down Expand Up @@ -714,3 +717,80 @@ def test_non_single_select_query_raises_exception(self):
"Invalid query type: Query Preview supports only a single SELECT statement",
str(context.exception),
)


class TestRegistryBasedChaining(TestCase):
"""Tests for dataframe-mode chaining via register_sql_query / the registry.

deepnote-internal registers a SQL block's source query keyed by its result
variable name; downstream blocks then reference the variable as a table and
it is expanded into a CTE - without the result becoming a DeepnoteQueryPreview.
"""

def setUp(self):
sql_query_chaining._sql_query_registry.clear()
self._original_vars = vars(__main__).copy()

def tearDown(self):
sql_query_chaining._sql_query_registry.clear()
for key in list(vars(__main__).keys()):
if key not in self._original_vars:
delattr(__main__, key)

def _bind(self, name, value="bound"):
"""Bind a variable in __main__ (simulates the block having executed)."""
setattr(__main__, name, value)
Comment on lines +730 to +742
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Type-annotate the new test helpers.

The new helper/lifecycle methods should carry explicit -> None annotations, and _bind should type its parameters as well. That keeps this class aligned with the repo rules and fixes Ruff ANN202 on Line 740.

As per coding guidelines, "Use type hints consistently" and "Use explicit type hints for function parameters and return values".

🧰 Tools
🪛 Ruff (0.15.14)

[warning] 740-740: Missing return type annotation for private function _bind

Add return type annotation: None

(ANN202)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/unit/test_sql_query_chaining.py` around lines 730 - 742, Add explicit
type annotations to the new test helper and lifecycle methods: annotate setUp
and tearDown with -> None, and annotate _bind as def _bind(self, name: str,
value: typing.Any = "bound") -> None (or use Any imported from typing) to
satisfy ANN202; keep references to sql_query_chaining._sql_query_registry and
__main__ unchanged.


def test_register_stores_single_select(self):
register_sql_query("df_1", "SELECT * FROM users")
self.assertEqual(
sql_query_chaining._sql_query_registry, {"df_1": "SELECT * FROM users"}
)

def test_register_non_select_clears_entry(self):
register_sql_query("df_1", "SELECT * FROM users")
register_sql_query("df_1", "INSERT INTO users VALUES (1)")
self.assertNotIn("df_1", sql_query_chaining._sql_query_registry)

def test_register_none_query_clears_entry(self):
register_sql_query("df_1", "SELECT * FROM users")
register_sql_query("df_1", None)
self.assertNotIn("df_1", sql_query_chaining._sql_query_registry)

def test_find_references_via_registry(self):
self._bind("df_1")
register_sql_query("df_1", "SELECT * FROM users")
refs = find_query_preview_references("SELECT * FROM df_1")
self.assertEqual(refs, {"df_1": "SELECT * FROM users"})

def test_unchain_via_registry(self):
self._bind("df_1")
register_sql_query("df_1", "SELECT * FROM users")
result = unchain_sql_query("SELECT * FROM df_1")
self.assertIn("WITH", result)
self.assertIn("df_1 AS", result)
self.assertIn("SELECT * FROM users", result)

def test_multi_level_registry_chaining(self):
self._bind("df_1")
self._bind("df_2")
register_sql_query("df_1", "SELECT * FROM users")
register_sql_query("df_2", "SELECT * FROM df_1 WHERE active")
result = unchain_sql_query("SELECT * FROM df_2")
# Both levels expanded, with the dependency (df_1) defined before df_2
self.assertIn("df_1 AS", result)
self.assertIn("df_2 AS", result)
self.assertIn("SELECT * FROM users", result)
self.assertLess(result.find("df_1 AS"), result.find("df_2 AS"))

def test_stale_entry_ignored_when_variable_not_bound(self):
# Registered but the variable is no longer bound in __main__ (e.g. deleted
# cell or kernel restart) -> the entry must not produce a CTE.
register_sql_query("df_1", "SELECT * FROM users")
refs = find_query_preview_references("SELECT * FROM df_1")
self.assertEqual(refs, {})

def test_empty_registry_is_safe(self):
refs = find_query_preview_references("SELECT * FROM df_1")
self.assertEqual(refs, {})
self.assertEqual(unchain_sql_query("SELECT * FROM df_1"), "SELECT * FROM df_1")
Loading